package cn.itcast.job.task;

import java.io.FileOutputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;

import cn.itcast.job.pojo.JobInfo;

@Component
public class JobProcessor implements PageProcessor {

    private String url="https://search.51job.com/list/000000,000000,0000,32%252C01,9,99,java,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";

    @Override
    public void process(Page page) {
        //解析页面，获取招聘信息详情的url地址
            List<Selectable> list= page.getHtml().css("div#resultList div.el").nodes();
        //判断获取到的集合是否为空
            if(list.size()==0){
                //如果为空，表示这是招聘详情页,解析页面，获取招聘详情信息，保存数据
                this.saveJobInfo(page);
            }else{

                for (Selectable selectable:list){
                    //获取url地址
                    String jobInfoUrl = selectable.links().toString();
                    //把获取到的url地址放到任务队列中
                    page.addTargetRequest(jobInfoUrl);
                }

                //获取下一页的url
               String bkUrl= page.getHtml().css("div.p_in li.bk").nodes().get(1).links().toString();

               // System.out.println(bkUrl);
                //把url放到任务队列中
                page.addTargetRequest(bkUrl);

            }

        //如果不为空，表示这是列表页

        String html = page.getHtml().toString();
     //   System.out.println(123);
    }

    private void saveJobInfo(Page page) {
        System.out.println(page.getUrl());
        //创建招聘详情对象
        JobInfo jobInfo=new JobInfo();
        //解析页面
        Html html=page.getHtml();
        try {
            FileOutputStream fos = new FileOutputStream("web.html",false);
            //true表示在文件末尾追加
            fos.write(html.toString().getBytes());
            fos.close();
        } catch(Exception e){
            e.printStackTrace();
        }
        //获取发布信息
        // String content = Jsoup.parse(html.css("div.cn p").regex(".*发布").toString()).text();
        //获取数据，封装到对象中
        //jobInfo.setCompanyName(html.css("div.cn p.cname a","text").toString());
        //jobInfo.setCompanyAddr(Jsoup.parse(html.css("div.bmsg").nodes().get(1).toString()).text());
        //jobInfo.setCompanyInfo(Jsoup.parse(html.css("div.tmsg").nodes().toString()).text());
        //jobInfo.setJobName(html.css("div.cn h1","text").toString());
        //jobInfo.setJobAddr(content.substring(0,2));
        //jobInfo.setJobInfo(Jsoup.parse(html.css("div.job_msg").toString()).text());
        //jobInfo.setUrl(page.getUrl().toString());
        ////获取薪资
        //Integer[] salary = MathSalary.getSalary(html.css("div.cn strong", "text").toString());
        //jobInfo.setSalaryMin(salary[0]);
        //jobInfo.setSalaryMax(salary[1]);

        //jobInfo.setTime(content.substring(content.length()-7,content.length()-2));
        // 获取数据，封装到对象中
        jobInfo.setCompanyName("compangy");
        jobInfo.setCompanyAddr("address");
        jobInfo.setCompanyInfo("companyinfo");
        jobInfo.setJobName("jobname");
        jobInfo.setJobAddr("job addr");
        jobInfo.setJobInfo("job info");
        jobInfo.setUrl(page.getUrl().toString());
        // jobInfo.setUrl("www.");
        jobInfo.setSalaryMin(10);
        jobInfo.setSalaryMax(20);

        // SimpleDateFormat formatter = new SimpleDateFormat("dd-MM-yyyy");
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss");
        Date date = new Date(); // this object contains the current date value
        jobInfo.setTime(formatter.format(date));

        //把结果保存起来
        page.putField("jobInfo",jobInfo);


    }

    private Site site=Site.me()
            .setCharset("gbk")
            .setTimeOut(10*1000)
            .setRetrySleepTime(3000) //设置重试的间隔时间
            .setRetryTimes(3);       //设置重试的次数

    @Override
    public Site getSite() {
        return site;
    }

    @Autowired
    private SpringDataPipeline springDataPipeline;


    //initialDelay当任务启动后，等多久执行方法
    @Scheduled(initialDelay = 1000,fixedDelay = 100*1000)
    public void process(){
        Spider.create(new JobProcessor())
                .addUrl(url)
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(10)
                .addPipeline(this.springDataPipeline)
                .run();
    }
}
